Basic Descriptives
Ns
# Row counts of the three data frames used throughout this report.
articles %>% nrow()
## [1] 13136
sources %>% nrow()
## [1] 77454
authors %>% nrow()
## [1] 1842
Articles
Number over Time by Outlet
# Yearly article counts per newspaper, drawn as one line per outlet.
articles %>%
  group_by(year, source) %>%
  # "drop_last" is what summarise() does by default here; stating it keeps the
  # result identical while silencing the regrouping message.
  summarise(n = n(), .groups = "drop_last") %>%
  ggplot(aes(x = year, y = n, color = source)) +
  geom_line() +
  theme_bw() +
  scale_x_continuous(breaks = seq(2012, 2022, by = 2)) +
  labs(x = "Year", y = "Number of Articles", color = "Newspaper") +
  theme_min

# No `plot` argument is given, so ggsave() writes the most recent plot.
ggsave(
  here("paper/figures/time_trend_n_articles.png"),
  width = 6,
  height = 4
)
Sources
Number per Article
# Distribution of the number of sources cited per article.
# hist() is called directly instead of through `%>%`: the pipe hands hist()
# the placeholder `.`, so its default title and x-axis label (built from the
# argument expression) would read "." rather than the variable name.
hist(articles$n_srcs)

# Five-number summary plus mean of the same column.
summary(articles$n_srcs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 6.000 5.896 8.000 31.000
Number per Article over Time
# Average sources per article by year: number of source rows divided by the
# number of distinct `filename` values in that year.
sources %>%
  group_by(year) %>%
  summarise(
    n = n(),
    n_articles = n_distinct(filename),
    avg_sources = n / n_articles
  ) %>%
  ggplot(aes(x = year, y = avg_sources)) +
  geom_line() +
  scale_x_continuous(breaks = c(2012, 2014, 2016, 2018, 2020, 2022)) +
  theme_min

# Regress sources-per-article on the post-2016 indicator and render the
# coefficient table as HTML. The etable(feols(...)) call is kept intact
# because the rendered column header is derived from it.
htmlTable(etable(feols(n_srcs ~ post2016, data = articles)))
|   |                 | feols(n_srcs ~ p.. |
|---|-----------------|--------------------|
| 1 | Dependent Var.: | n_srcs |
| 2 |  |  |
| 3 | Constant | 5.272*** (0.0492) |
| 4 | post2016TRUE | 0.8374*** (0.0570) |
| 5 | _______________ | __________________ |
| 6 | S.E. type | IID |
| 7 | Observations | 13,136 |
| 8 | R2 | 0.01617 |
| 9 | Adj. R2 | 0.01610 |
Number per Outlet
# Per-newspaper totals: source rows, distinct `filename` values, and their
# ratio (average sources per article), printed as a table.
sources %>%
  group_by(source) %>%
  summarise(
    n = n(),
    n_articles = n_distinct(filename),
    avg_sources = n / n_articles
  ) %>%
  kable()
| source | n | n_articles | avg_sources |
|:--|--:|--:|--:|
| Chicago Tribune | 3052 | 578 | 5.280277 |
| Los Angeles Times | 10173 | 1796 | 5.664254 |
| New York Times | 28282 | 4671 | 6.054806 |
| USA Today | 3397 | 684 | 4.966374 |
| Wall Street Journal | 12081 | 2286 | 5.284777 |
| Washington Post | 20469 | 3121 | 6.558475 |
# Average sources per article for every newspaper-year, one line per outlet.
sources %>%
  group_by(source, year) %>%
  summarise(
    n = n(),
    n_articles = n_distinct(filename),
    avg_sources = n / n_articles,
    # Same grouping summarise() would leave by default, stated explicitly.
    .groups = "drop_last"
  ) %>%
  ggplot(aes(x = year, y = avg_sources, color = source)) +
  geom_line() +
  theme_min

# The example articles
# filter(str_detect(filename, "1270322870.xml|1731405273.xml")) %>%
Source Types (Table 1)
# Table 1, part 1: percentage of source rows in each top-level category.
gen.cats <- table(sources$category) %>%
  prop.table() %>%
  as.data.frame() %>%
  rename(Category = "Var1") %>%
  mutate(
    Freq = paste0(round(100 * Freq, digits = 1), "%")
  )

# Finer sub-category label. case_when() takes the first matching branch, so
# the environmental / fossil-fuel labels win over the party and "- Other"
# labels that follow.
sources$category2 <- case_when(
  sources$env_category == "environmental" ~ paste(sources$category, "- Environmental"),
  sources$env_category == "fossil fuel" ~ paste(sources$category, "- Fossil Fuel"),
  sources$category == "Politician" & sources$pol_party == "Democrat" ~ "Politician - Democrat",
  sources$category == "Politician" & sources$pol_party == "Republican" ~ "Politician - Republican",
  # Any politician not matched by an earlier branch lands here. The condition
  # used to repeat `category == "Politician"` on both sides of `&`; the
  # duplicate is dropped, which leaves the result unchanged.
  # NOTE(review): confirm "International" is the intended label for every
  # remaining politician (a second, different condition may have been meant).
  sources$category == "Politician" ~ "Politician - International",
  sources$category == "Citizen" ~ "Other",
  sources$category == "Advocacy" ~ "Advocacy - Other",
  sources$category == "Business" ~ "Business - Other",
  TRUE ~ sources$category
)

# Table 1, part 2: percentage of source rows in each sub-category.
sub.cats <- table(sources$category2) %>%
  prop.table() %>%
  as.data.frame() %>%
  rename(Category = "Var1") %>%
  mutate(
    Freq = paste0(round(100 * Freq, digits = 1), "%")
  )

# Stack both tables; distinct() removes rows that appear identically in both
# (categories that have no sub-category split).
rbind(gen.cats, sub.cats) %>%
  distinct() %>%
  kable()
| Category | Freq |
|:--|--:|
| Academic | 10.2% |
| Advocacy | 21.5% |
| Bureaucrat | 12.4% |
| Business | 15% |
| International | 11.3% |
| Media | 5.6% |
| Other | 4.1% |
| Politician | 19.8% |
| Advocacy - Environmental | 13.7% |
| Advocacy - Fossil Fuel | 0.4% |
| Advocacy - Other | 7.5% |
| Business - Environmental | 2% |
| Business - Fossil Fuel | 4.3% |
| Business - Other | 8.6% |
| Politician - Democrat | 12% |
| Politician - International | 2.8% |
| Politician - Republican | 5% |
Distribution of source types by newspaper
# Stacked horizontal bars: within each newspaper, the proportion of source
# rows falling in each category.
sources %>%
  mutate(
    category = factor(
      category,
      levels = c(
        "Academic", "Advocacy", "Business", "Bureaucrat",
        "Politician", "International", "Media", "Other"
      )
    )
  ) %>%
  group_by(category, source) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  # Proportions are taken within newspaper, so regroup by `source`.
  group_by(source) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(source, n), y = prop, fill = fct_rev(category))) +
  # geom_col() is geom_bar(stat = "identity").
  geom_col() +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "bottom"
  ) +
  labs(x = "", y = "Prop", fill = "Category") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  coord_flip() +
  guides(fill = guide_legend(reverse = TRUE))

# No `plot` argument is given, so ggsave() writes the most recent plot.
ggsave(
  here("paper/figures/source-dist.png"),
  width = 8,
  height = 6
)
# Collapse the six categories listed below into "Other"; every other value of
# category.slant keeps its label. The result is then turned into a factor
# with a fixed level order.
sources.s2 <- sources %>%
  mutate(
    category.slant2 = case_when(
      category.slant %in% c(
        "Bureaucrat", "International", "Politician",
        "Academic", "Media", "Advocacy"
      ) ~ "Other",
      TRUE ~ category.slant
    ),
    category.slant2 = factor(
      category.slant2,
      levels = c(
        "Environmental", "Business", "Fossil Fuel",
        "Democrat", "Republican", "Other"
      )
    )
  )

# Stacked horizontal bars of the collapsed categories within each newspaper.
sources.s2 %>%
  group_by(category.slant2, source) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  group_by(source) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  ggplot(
    aes(x = reorder(source, n), y = prop, fill = fct_rev(category.slant2))
  ) +
  # geom_col() is geom_bar(stat = "identity").
  geom_col() +
  scale_fill_manual(
    values = c(
      "Environmental" = "#91bfdb",
      "Democrat" = "#4575b4",
      "Republican" = "#d73027",
      "Fossil Fuel" = "#fc8d59",
      "Business" = "#fee090",
      "Other" = "#ffffbf"
    )
  ) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "bottom"
  ) +
  labs(x = "", y = "Prop", fill = "Category") +
  coord_flip() +
  guides(fill = guide_legend(reverse = TRUE))

# No `plot` argument is given, so ggsave() writes the most recent plot.
ggsave(
  here("paper/figures/source-dist-2.png"),
  width = 8,
  height = 6
)

# Overall share of each collapsed category.
sources.s2$category.slant2 %>%
  table() %>%
  prop.table() %>%
  kable()
| Category | Proportion |
|:--|--:|
| Environmental | 0.1536783 |
| Business | 0.0844501 |
| Fossil Fuel | 0.0465050 |
| Democrat | 0.1446923 |
| Republican | 0.0551811 |
| Other | 0.5154931 |
DIME Scores Correlate with Category
# Regress cfscore (presumably the DIME ideology score, per the section title)
# on category.slant and render the coefficient table as HTML.
# NOTE(review): because the model is piped into etable, the rendered column
# header is "." instead of the model call (compare the header produced by the
# nested etable(feols(...)) form used elsewhere in this report). Consider
# nesting the call here too if a meaningful header is wanted.
feols(cfscore ~ category.slant, data = sources) %>%
etable %>%
htmlTable
|    |                 | . |
|----|-----------------|---|
| 1 | Dependent Var.: | cfscore |
| 2 |  |  |
| 3 | Constant | 0.8953*** (0.0096) |
| 4 | category.slantAdvocacy | -0.9919*** (0.0152) |
| 5 | category.slantBusiness | -0.7901*** (0.0144) |
| 6 | category.slantDemocrat | -1.468*** (0.0117) |
| 7 | category.slantEnvironmental | -1.716*** (0.0130) |
| 8 | category.slantFossilFuel | -0.5482*** (0.0152) |
| 9 | category.slantInternational | -1.378*** (0.0856) |
| 10 | category.slantOther | -0.9430*** (0.0239) |
| 11 | ___________________________ | ___________________ |
| 12 | S.E. type | IID |
| 13 | Observations | 23,876 |
| 14 | R2 | 0.50244 |
| 15 | Adj. R2 | 0.50229 |
Top Source DIME scores
# The 50 most-cited organisations that have a non-missing cfscore (politician-
# sourced scores excluded), with citation count `n` and both score variants.
dt <- sources %>%
  filter(cfscore_src != "politician") %>%
  filter(!is.na(org_id)) %>%
  group_by(org_id, cfscore.i2) %>%
  summarize(
    n = n(),
    cfscore = fmode(cfscore),
    name = first(organization_name),
    category.slant = first(category.slant),
    # Drop all grouping: otherwise the result stays grouped by org_id and the
    # row_number() below restarts inside every organisation instead of
    # ranking the 50 rows as a whole.
    .groups = "drop"
  ) %>%
  arrange(desc(n)) %>%
  filter(!is.na(cfscore)) %>%
  head(50) %>%
  arrange(n) %>%
  mutate(rn = 51 - row_number(), n_size = n + 10)

# Organisations that receive a text label in both scatter plots.
# The comparison is case-insensitive: category.slant stores the level as
# "Other", so the former test against lower-case "other" never excluded
# anything.
dt.labels <- subset(
  dt,
  tolower(category.slant) != "other" &
    !(name %in% c("The Associated Press", "Associated Press", "California"))
)

# Scatter of citation count (log scale) against cfscore.
dt %>%
  ggplot(aes(x = cfscore, size = sqrt(n), y = n, color = cfscore)) +
  theme_bw() +
  scale_y_log10() +
  scale_x_continuous(limits = c(-1.5, 1.2)) +
  scale_color_gradient(low = "blue", high = "red") +
  geom_vline(xintercept = 0, lty = "dashed") +
  theme(panel.grid = element_blank(), axis.text.y = element_blank()) +
  labs(x = "DIME Ideology", y = "") +
  geom_text_repel(
    data = dt.labels,
    aes(x = cfscore, label = name),
    direction = "y",
    min.segment.length = 999
  ) +
  guides(size = "none", color = "none", fill = "none")

ggsave(here("paper/figures/dime-scatter-100.png"), width = 10, height = 6)

# Same plot using the cfscore.i2 variant of the score on the x-axis.
dt %>%
  ggplot(aes(x = cfscore.i2, size = sqrt(n), y = n, color = cfscore.i2)) +
  theme_bw() +
  scale_y_log10() +
  scale_x_continuous(limits = c(-1.5, 1.2)) +
  scale_color_gradient(low = "blue", high = "red") +
  geom_vline(xintercept = 0, lty = "dashed") +
  theme(panel.grid = element_blank(), axis.text.y = element_blank()) +
  labs(x = "DIME Ideology", y = "") +
  geom_text_repel(
    data = dt.labels,
    aes(x = cfscore.i2, label = name),
    direction = "y",
    min.segment.length = 999
  ) +
  guides(size = "none", color = "none", fill = "none")

ggsave(here("paper/figures/dime-scatter-100-i2.png"), width = 10, height = 6)
Top DIME over time
# The seven most-cited organisations in `dt`.
dt7 <- dt %>%
  arrange(desc(n)) %>%
  head(7)

# Yearly citation counts for those organisations, smoothed; colour separates
# negative from non-negative cfscore, line type separates organisations.
sources %>%
  filter(org_id %in% dt7$org_id) %>%
  group_by(year, organization_name, org_id, cfscore) %>%
  # "drop_last" is the default regrouping; stated to silence the message.
  summarize(n = n(), .groups = "drop_last") %>%
  ggplot(aes(
    x = year, y = n, color = cfscore < 0, lty = organization_name
  )) +
  # FALSE instead of F: `F` is an ordinary variable that can be reassigned.
  geom_smooth(se = FALSE) +
  theme_min

Politicians
# Tag each source row: presidents are matched by surname in person_name,
# other rows keep their party label when category.slant is Democrat or
# Republican, and everything else is missing.
pres <- sources %>%
  mutate(
    pres = case_when(
      str_detect(person_name, "Biden") ~ "Biden",
      str_detect(person_name, "Obama") ~ "Obama",
      str_detect(person_name, "Trump") ~ "Trump",
      category.slant %in% c("Democrat", "Republican") ~ category.slant,
      # Typed NA so every branch is character, also on dplyr versions that
      # do not coerce a bare logical NA.
      TRUE ~ NA_character_
    )
  )

# Total number of source rows per year, used to normalise the counts.
npy <- sources %>%
  group_by(year) %>%
  summarize(npy = n())

# Yearly counts per tag; computed once and shared by both panels.
pres.n <- pres %>%
  group_by(year, pres) %>%
  filter(!is.na(pres)) %>%
  summarize(n = n(), .groups = "drop_last")

ggarrange(plotlist = list(
  # Raw counts.
  pres.n %>%
    ggplot(aes(x = year, y = n, color = pres)) +
    geom_line() +
    theme_min,
  # Normalize by # of sources per year...
  pres.n %>%
    # Explicit key: `year` is the only column the two tables share.
    left_join(npy, by = "year") %>%
    ggplot(aes(x = year, y = n / npy, color = pres, shape = pres)) +
    geom_line() +
    geom_point() +
    scale_x_continuous(breaks = seq(2012, 2022, 4)) +
    theme_min
), common.legend = TRUE, legend = "bottom")

Journalists
Demographics
# Grid of univariate distributions for the journalist table: a histogram for
# estimated age and count bars for the remaining attributes.
ggarrange(plotlist = list(
  ggplot(authors, aes(x = age_est_2017)) + geom_histogram(),
  # geom_bar() counts rows by default, i.e. stat = "count".
  ggplot(authors, aes(x = elite_undergrad_ivyplus)) + geom_bar(),
  ggplot(authors, aes(x = edu.has_postgrad)) + geom_bar(),
  ggplot(authors, aes(x = field.journo)) + geom_bar(),
  ggplot(authors, aes(x = gender)) + geom_bar(),
  ggplot(authors, aes(x = race.nonwhite)) + geom_bar()
))

Demographics over time
Unit of analysis: unique journalists per year
# One row per author-year (together with the author attributes listed in the
# grouping); `n` is the number of article rows behind it. Rows without an
# author name are dropped.
journo.years <- articles %>%
  group_by(
    author_name, year, elite_undergrad_ivyplus, edu.undergrad,
    edu.has_postgrad, is_career, field.journo, age_est, gender, race.nonwhite
  ) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  filter(!is.na(author_name))

# Each panel smooths a 0/1 indicator over years with loess.
age.plt <- ggplot(journo.years, aes(x = year, y = as.numeric(age_est > 35))) +
  geom_smooth(method = "loess") +
  labs(x = "Year", y = "Age > 35")

ivy.plt <- ggplot(
  journo.years,
  aes(x = year, y = as.numeric(elite_undergrad_ivyplus))
) +
  geom_smooth(method = "loess") +
  labs(y = "Ivy League", x = "Year")

fj.plt <- ggplot(journo.years, aes(x = year, y = as.numeric(field.journo))) +
  labs(y = "Journalism Degree", x = "Year") +
  geom_smooth(method = "loess")

pg.plt <- ggplot(
  journo.years,
  aes(x = year, y = as.numeric(edu.has_postgrad))
) +
  labs(y = "Postgraduate Degree", x = "Year") +
  geom_smooth(method = "loess")

gender.plt <- ggplot(
  journo.years,
  aes(x = year, y = as.numeric(gender == "female"))
) +
  labs(y = "Female", x = "Year") +
  geom_smooth(method = "loess")

race.plt <- ggplot(journo.years, aes(x = year, y = as.numeric(race.nonwhite))) +
  labs(y = "Nonwhite", x = "Year") +
  geom_smooth(method = "loess")

# Apply the shared theme and x-axis breaks to every panel.
plts <- lapply(
  list(age.plt, ivy.plt, fj.plt, gender.plt, pg.plt, race.plt),
  function(panel) {
    panel +
      theme_bw() +
      theme(panel.grid = element_blank()) +
      scale_x_continuous(breaks = seq(2012, 2022, 2))
  }
)
ggarrange(plotlist = plts)

ggsave(here("paper/figures/journo-agg-2.png"), width = 10, height = 6)
Ideological Sources
# Smoothed yearly counts of sources in each category.slant.
sources %>%
  group_by(year, category.slant) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  ggplot(aes(x = year, y = n, color = category.slant, lty = category.slant)) +
  # FALSE instead of F: `F` is an ordinary variable that can be reassigned.
  geom_smooth(se = FALSE) +
  theme_min

# Total number of source rows per year, used as the denominator below.
sources_per_year <- sources %>%
  group_by(year) %>%
  summarize(year.n = n())

# Each category's share of that year's sources.
sources %>%
  group_by(year, category.slant) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  # Explicit key: `year` is the only column the two tables share.
  left_join(sources_per_year, by = "year") %>%
  ggplot(aes(x = year, y = n / year.n, color = category.slant, lty = category.slant)) +
  geom_line() +
  theme_min

Coverage/data quality
Articles with DIME and journalist information
# calculate_balance() is defined elsewhere in the project; its output is used
# here as an article-level table (presumably one row per article — confirm).
articles.b <- calculate_balance(sources, types = c("org_category", "cfscore"))

# Successive restrictions: a known author, then also a non-missing cfscore.
articles.with.author <- filter(articles.b, !is.na(author))
articles.with.author.cfscore <- filter(articles.with.author, !is.na(cfscore))

nrow(articles.with.author)
## [1] 12348
nrow(articles.with.author.cfscore)
## [1] 9915
DIME coverage
# Share of all source rows with a non-missing score: observed, then imputed.
mean(!is.na(sources$cfscore))
## [1] 0.3082604
mean(!is.na(sources$cfscore.impute))
## [1] 0.5452656
# Availability of cfscore (rows) by category.slant (columns).
table(!is.na(sources$cfscore), sources$category.slant)
##
## Academic Advocacy Bureaucrat Business Democrat Environmental
## FALSE 7506 2981 9635 3647 3658 7638
## TRUE 0 2384 0 2894 7549 4265
##
## Fossil Fuel International Media Other Politician Republican
## FALSE 1180 6829 4318 3955 1578 653
## TRUE 2422 46 0 695 0 3621
# Denominator: number of sources whose category.slant, compared
# case-insensitively, is one of the six categories listed in the filter.
n_eligible <- sources %>%
filter(tolower(category.slant) %in% c("advocacy", "environmental", "fossil fuel", "business", "democrat", "republican")) %>%
nrow()
# Excludes every other category (academic, bureaucrat, international, media,
# other, politician).
# NOTE(review): the numerators below count non-missing scores over ALL
# sources, not only the eligible ones. The availability cross-tab printed
# just before this chunk shows scored sources in International (46) and
# Other (695), which are outside the denominator, so these ratios are
# slightly overstated. Confirm whether the numerators should be restricted
# to the same six categories.
nrow(sources %>% filter(!is.na(cfscore))) / n_eligible
## [1] 0.5566539
nrow(sources %>% filter(!is.na(cfscore.impute))) / n_eligible
## [1] 0.9846358
Citation Topics Are Correlated with Source Types, Politician Parties, and Ideology
# One regression per citation topic: a topic indicator (src_topic equal to
# Business / Environment / Policy) on category.slant, rendered side by side.
htmlTable(etable(list(
  feols(src_topic == "Business" ~ category.slant, data = sources),
  feols(src_topic == "Environment" ~ category.slant, data = sources),
  feols(src_topic == "Policy" ~ category.slant, data = sources)
)))
|    |                 | model 1 | model 2 | model 3 |
|----|-----------------|---------|---------|---------|
| 1 | Dependent Var.: | src_topic=="Business" | src_topic=="Environment" | src_topic=="Policy" |
| 2 |  |  |  |  |
| 3 | Constant | 0.1106*** (0.0038) | 0.3392*** (0.0044) | 0.4301*** (0.0055) |
| 4 | category.slantAdvocacy | 0.0410*** (0.0059) | -0.1800*** (0.0068) | 0.1183*** (0.0085) |
| 5 | category.slantBureaucrat | 0.0011 (0.0051) | -0.0952*** (0.0059) | 0.1068*** (0.0073) |
| 6 | category.slantBusiness | 0.2725*** (0.0056) | -0.2082*** (0.0065) | -0.0638*** (0.0080) |
| 7 | category.slantDemocrat | -0.0583*** (0.0049) | -0.2378*** (0.0057) | 0.3641*** (0.0071) |
| 8 | category.slantEnvironmental | 0.0445*** (0.0049) | -0.0652*** (0.0056) | 0.0975*** (0.0070) |
| 9 | category.slantFossilFuel | 0.3539*** (0.0067) | -0.2701*** (0.0077) | 0.0025 (0.0096) |
| 10 | category.slantInternational | -0.0397*** (0.0055) | -0.1318*** (0.0064) | 0.2108*** (0.0079) |
| 11 | category.slantMedia | -0.0325*** (0.0063) | -0.1852*** (0.0073) | -0.0308*** (0.0090) |
| 12 | category.slantOther | -0.0179** (0.0061) | -0.0994*** (0.0071) | -0.0017 (0.0088) |
| 13 | category.slantPolitician | -0.0656*** (0.0091) | -0.2961*** (0.0106) | 0.4141*** (0.0131) |
| 14 | category.slantRepublican | -0.0535*** (0.0063) | -0.3067*** (0.0073) | 0.4031*** (0.0091) |
| 15 | ___________________________ | _____________________ | ________________________ | ___________________ |
| 16 | S.E. type | IID | IID | IID |
| 17 | Observations | 77,454 | 77,454 | 77,454 |
| 18 | R2 | 0.10390 | 0.05104 | 0.09159 |
| 19 | Adj. R2 | 0.10378 | 0.05090 | 0.09146 |
# Same three topic-indicator regressions, with env_category as the regressor.
htmlTable(etable(list(
  feols(src_topic == "Business" ~ env_category, data = sources),
  feols(src_topic == "Environment" ~ env_category, data = sources),
  feols(src_topic == "Policy" ~ env_category, data = sources)
)))
|   |                 | model 1 | model 2 | model 3 |
|---|-----------------|---------|---------|---------|
| 1 | Dependent Var.: | src_topic=="Business" | src_topic=="Environment" | src_topic=="Policy" |
| 2 |  |  |  |  |
| 3 | Constant | 0.1545*** (0.0036) | 0.2713*** (0.0037) | 0.5313*** (0.0045) |
| 4 | env_categoryfossilfuel | 0.3066*** (0.0075) | -0.2028*** (0.0077) | -0.0953*** (0.0094) |
| 5 | ______________________ | _____________________ | ________________________ | ___________________ |
| 6 | S.E. type | IID | IID | IID |
| 7 | Observations | 15,818 | 15,818 | 15,818 |
| 8 | R2 | 0.09579 | 0.04208 | 0.00647 |
| 9 | Adj. R2 | 0.09573 | 0.04202 | 0.00641 |
# Same three topic-indicator regressions, with pol_party as the regressor.
htmlTable(etable(list(
  feols(src_topic == "Business" ~ pol_party, data = sources),
  feols(src_topic == "Environment" ~ pol_party, data = sources),
  feols(src_topic == "Policy" ~ pol_party, data = sources)
)))
|    |                 | model 1 | model 2 | model 3 |
|----|-----------------|---------|---------|---------|
| 1 | Dependent Var.: | src_topic=="Business" | src_topic=="Environment" | src_topic=="Policy" |
| 2 |  |  |  |  |
| 3 | Constant | 0.0519*** (0.0021) | 0.0982*** (0.0025) | 0.7978*** (0.0037) |
| 4 | pol_partyOther | 0.0271*** (0.0058) | 0.0115. (0.0069) | -0.0781*** (0.0100) |
| 5 | pol_partyRepublican | 0.0035 (0.0040) | -0.0652*** (0.0048) | 0.0377*** (0.0070) |
| 6 | ___________________ | _____________________ | ________________________ | ___________________ |
| 7 | S.E. type | IID | IID | IID |
| 8 | Observations | 18,122 | 18,122 | 18,122 |
| 9 | R2 | 0.00122 | 0.01106 | 0.00601 |
| 10 | Adj. R2 | 0.00111 | 0.01095 | 0.00590 |
# Same three topic-indicator regressions, with cfscore as the regressor.
htmlTable(etable(list(
  feols(src_topic == "Business" ~ cfscore, data = sources),
  feols(src_topic == "Environment" ~ cfscore, data = sources),
  feols(src_topic == "Policy" ~ cfscore, data = sources)
)))
|   |                 | model 1 | model 2 | model 3 |
|---|-----------------|---------|---------|---------|
| 1 | Dependent Var.: | src_topic=="Business" | src_topic=="Environment" | src_topic=="Policy" |
| 2 |  |  |  |  |
| 3 | Constant | 0.1655*** (0.0024) | 0.1175*** (0.0022) | 0.6534*** (0.0031) |
| 4 | cfscore | 0.0485*** (0.0029) | -0.0491*** (0.0026) | -0.0078* (0.0038) |
| 5 | _______________ | _____________________ | ________________________ | ___________________ |
| 6 | S.E. type | IID | IID | IID |
| 7 | Observations | 23,876 | 23,876 | 23,876 |
| 8 | R2 | 0.01182 | 0.01472 | 0.00018 |
| 9 | Adj. R2 | 0.01178 | 0.01467 | 0.00014 |
# Reverse direction of the models above: regress cfscore on src_topic and
# print the coefficient table as plain text. The call is left nested because
# the printed column header is taken from it.
etable(feols(cfscore ~ src_topic, data = sources))
## feols(cfscore ~ s..
## Dependent Var.: cfscore
##
## Constant 0.0495*** (0.0132)
## src_topicEnvironment -0.4679*** (0.0198)
## src_topicOther -0.1150*** (0.0247)
## src_topicPolicy -0.2135*** (0.0147)
## ____________________ ___________________
## S.E. type IID
## Observations 23,876
## R2 0.02366
## Adj. R2 0.02353
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Non-policy articles cite fewer ideological sources
# Centre cfscore on its mean over the non-missing values.
# TRUE instead of T: `T` is an ordinary variable that can be reassigned.
all.sources$cfscore_std <- all.sources$cfscore - mean(all.sources$cfscore, na.rm = TRUE)

# calculate_balance() is defined elsewhere in the project; its output is used
# below as the regression data and supplies n_left / n_right.
all.sources.balance <- all.sources %>%
  calculate_balance(types = c("cfscore", "org_category"))

# Regress n_left and n_right on policy_label_gpt.
# NOTE(review): the rendered table for these models lists only a Constant
# row, which suggests policy_label_gpt was dropped from both fits (e.g. it is
# constant or missing in all.sources.balance) — verify before relying on the
# section's conclusion.
models <- list(
  feols(n_left ~ policy_label_gpt, data = all.sources.balance),
  feols(n_right ~ policy_label_gpt, data = all.sources.balance)
)
etable(models) %>% htmlTable()
|
|
|
model 1
|
model 2
|
|
1
|
Dependent Var.:
|
n_left
|
n_right
|
|
2
|
|
|
|
|
3
|
Constant
|
1.208*** (0.0117)
|
0.5336*** (0.0084)
|
|
4
|
_______________
|
_________________
|
__________________
|
|
5
|
S.E. type
|
IID
|
IID
|
|
6
|
Observations
|
13,136
|
13,136
|
END